# Computations
import numpy as np
import pandas as pd
import re
from sklearn import preprocessing
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and 3.8
# dropped the old names, so use whichever spelling this installation provides.
# If neither is listed, fall through to the original name so the error surfaces.
plt.style.use(next((name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
                    if name in plt.style.available), 'seaborn-whitegrid'))
# Font sizes and text colour shared by every matplotlib figure below.
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we use Kaggle's Pima Indians Diabetes dataset. The Pima Indians are a group of Native Americans living in an area consisting of what is now central and southern Arizona. A variety of statistical methods are used here for predictions.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
The dataset consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
# Load the raw CSV into a DataFrame.
Path = 'pima-indians-diabetes-database/diabetes.csv'
Data = pd.read_csv(Path)
# Insert a space before every capital letter that follows a word character
# ("BloodPressure" -> "Blood Pressure"); the regex also splits "BMI" into
# "B MI", which the trailing replace() undoes.
Data.columns = [re.sub(r"(\w)([A-Z])", r"\1 \2", x).replace('B MI','BMI') for x in Data.columns]
display(Data.head())
# One-row summary of the frame's shape, shown without the index column.
# Styler.hide_index() was removed in pandas 2.0 in favour of
# Styler.hide(axis='index') (added in 1.4), so use whichever is available.
_shape_style = pd.DataFrame({'Number of Instances': [Data.shape[0]], 'Number of Attributes': [Data.shape[1]]}).style
display(_shape_style.hide(axis='index') if hasattr(_shape_style, 'hide') else _shape_style.hide_index())
del _shape_style
# Name of the label column and the display names for its two classes (0, 1).
Target = 'Outcome'
Labels = ['Non-Diabetic', 'Diabetic']
| | Pregnancies | Glucose | Blood Pressure | Skin Thickness | Insulin | BMI | Diabetes Pedigree Function | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| Number of Instances | Number of Attributes |
|---|---|
| 768 | 9 |
| Feature | Explanations |
|---|---|
| Pregnancies | Number of times pregnant |
| Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test |
| BloodPressure | Diastolic blood pressure (mm Hg) |
| SkinThickness | Triceps skinfold thickness (mm) |
| Insulin | 2-Hour serum insulin (mu U/ml) |
| BMI | Body mass index (weight in kg/(height in m)^2) |
| DiabetesPedigreeFunction | Diabetes pedigree function |
| Age | Age (years) |
| Outcome | Whether or not a patient has diabetes |
def Data_Plot(Inp, W = False):
    """Show a per-column completeness bar chart and return the summary table.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to summarise.
    W : int or False, optional
        Figure width passed to plotly; any falsy value keeps plotly's default.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp`` with the columns 'Features',
        'Data Type', 'Number of NaN Values', 'Size' (row count of ``Inp``)
        and 'Percentage' (share of non-NaN entries, in percent).
    """
    # dtypes are stringified so they can serve as a discrete colour/text key.
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info ['Size'] = Inp.shape[0]
    # 100 minus the NaN share, i.e. the percentage of usable values per column.
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
    data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
    #
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type', text = 'Data Type',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                         bordercolor="DarkGray", borderwidth=1))
    # Show the feature name inside each bar, padded away from the bar edge.
    fig.update_traces(texttemplate= 6*' ' + '%{label}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    if W:
        fig.update_layout(width = W)
    # The bold tag is now properly closed (the original ended with a second '<b>').
    # NOTE(review): the y axis plots 'Percentage' but is titled 'Frequency' - confirm the label.
    fig.update_layout(title={'text': '<b>' + 'Dataset Overview' + '</b>', 'x':0.5,
                             'y': 0.92, 'xanchor': 'center', 'yanchor': 'top'}, yaxis_title='Frequency')
    fig.show()
    return data_info
# Draw the overview for the loaded frame with W = 800; the returned summary table is discarded.
_ = Data_Plot(Data, W = 800)
Let's take a close look at our data.
# 4 x 2 grid, flattened so panel k shows the k-th column; the last column
# of Data is left out of the grid.
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
ax = ax.ravel()
for i, feature in enumerate(Data.columns[:-1]):
    # Step histogram (navy) + KDE curve (black) + rug marks (red) per feature.
    sns.distplot(
        Data[feature],
        rug=True,
        rug_kws={"color": "red"},
        kde_kws={"color": "k", "lw": 2, "label": "KDE"},
        hist_kws={"histtype": "step", "linewidth": 2, "alpha": 1, "color": "Navy"},
        ax=ax[i],
    )
# Hover text for each point: the class name derived from the 0/1 Outcome flag.
Temp = ['Diabetic' if flag != 0 else 'Non-Diabetic' for flag in Data['Outcome']]
# (axis caption, column name) for every panel; '<br>' wraps long captions.
splom_dims = [('Pregnancies', 'Pregnancies'),
              ('Glucose', 'Glucose'),
              ('Blood<br>Pressure', 'Blood Pressure'),
              ('Skin<br>Thickness', 'Skin Thickness'),
              ('Insulin', 'Insulin'),
              ('BMI', 'BMI'),
              ('Diabetes<br>Pedigree<br>Fun', 'Diabetes Pedigree Function'),
              ('Age', 'Age')]
# Lower-triangle scatter-plot matrix, points coloured by Outcome.
fig = go.Figure(data=go.Splom(
    dimensions=[dict(label=caption, values=Data[column]) for caption, column in splom_dims],
    showupperhalf=False,
    marker=dict(color=Data['Outcome'], size=4, colorscale='Bluered',
                line=dict(width=0.4, color='black')),
    text=Temp,
    diagonal=dict(visible=False)))
del Temp, splom_dims
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()
As can be seen, the data has a normal distribution, and some entries need to be adjusted. To do so, we define a normalizer as follows: for a given vector $x$,
def Normalizer(Col, cut = 3):
    r"""Keep the entries of ``Col`` lying strictly within ``cut`` standard deviations of its mean.

    The normalisation used in this notebook is

    \begin{align*}
    \text{Normalizer}(x, cut) =
    \begin{cases}
    x_i & \mbox{if } |x_i - \mu| < \sigma \times cut \\
    mode(x) & \mbox{else}
    \end{cases}.
    \end{align*}

    This function implements only the first case: entries failing the test
    are dropped from the result. The replacement by the mode is done by the
    caller after assigning the result back (dropped rows show up as NaN).

    Parameters
    ----------
    Col : pandas.Series
        Numeric column to filter.
    cut : float, optional
        Half-width of the accepted band, in standard deviations (default 3).

    Returns
    -------
    pandas.Series
        The retained entries with their original index labels. Both bounds
        are exclusive, so a constant column (std of 0) yields an empty result.
    """
    # Compute the band once instead of re-evaluating mean/std for each bound.
    centre = Col.mean()
    half_width = Col.std() * cut
    return Col[(Col > (centre - half_width)) &
               (Col < (centre + half_width))]
# Normalized Data
df = Data.copy()
fig, ax = plt.subplots(nrows=4, ncols=2, figsize = (16, 20))
ax = ax.ravel()
for i, feature in enumerate(df.columns[:-1]):
    # Normalizer returns only the retained rows; assigning the result back
    # aligns on the index, so every dropped row becomes NaN in df.
    df[feature] = Normalizer(Data[feature])
    # Fill those NaNs with the most frequent retained value ([0] takes the
    # first mode when several values tie).
    fill_value = df[feature].dropna().mode()[0]
    df[feature] = df[feature].fillna(fill_value)
    # Sub-Plots
    sns.distplot(
        df[feature],
        rug=True,
        rug_kws={"color": "red"},
        kde_kws={"color": "k", "lw": 2, "label": "KDE"},
        hist_kws={"histtype": "step", "linewidth": 2, "alpha": 1, "color": "Navy"},
        ax=ax[i],
    )
Basically, we diminished the influence of certain data points (see the following figure).
# Absolute change introduced by the normalisation: |raw - normalised| for every
# column except the last one, which keeps its values from Data.
Temp = Data.copy()
Temp.iloc[:,:-1] = (Data.iloc[:,:-1] - df.iloc[:,:-1]).abs()
# Hover text for each point: the class name derived from the 0/1 Outcome flag.
Temp0 = ['Diabetic' if flag != 0 else 'Non-Diabetic' for flag in Temp['Outcome']]
# (axis caption, column name) for every panel; '<br>' wraps long captions.
splom_dims = [('Pregnancies', 'Pregnancies'),
              ('Glucose', 'Glucose'),
              ('Blood<br>Pressure', 'Blood Pressure'),
              ('Skin<br>Thickness', 'Skin Thickness'),
              ('Insulin', 'Insulin'),
              ('BMI', 'BMI'),
              ('Diabetes<br>Pedigree<br>Fun', 'Diabetes Pedigree Function'),
              ('Age', 'Age')]
fig = go.Figure(data=go.Splom(
    dimensions=[dict(label=caption, values=Temp[column]) for caption, column in splom_dims],
    showupperhalf=False,
    marker=dict(color=Temp['Outcome'], size=4, colorscale='Bluered',
                line=dict(width=0.4, color='black')),
    text=Temp0,
    diagonal=dict(visible=False)))
del Temp, Temp0, splom_dims
fig.update_layout(title='Scatterplot Matrix', dragmode='select',
                  width=900, height=900, hovermode='closest')
fig.show()
def Correlation_Plot (Inp, Fig_Size = 12, annot_kws = 11):
    """Draw a lower-triangle correlation heatmap of ``Inp`` and return the matrix.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame whose pairwise column correlations are plotted.
    Fig_Size : number, optional
        Width and height of the (square) matplotlib figure.
    annot_kws : number, optional
        Font size of the numbers written inside the cells.

    Returns
    -------
    pandas.DataFrame
        ``Inp.corr()`` rounded to two decimals.
    """
    Correlation_Matrix = Inp.corr().round(2)
    # 1.0 strictly above the diagonal and 0.0 elsewhere: the upper triangle is
    # hidden while the diagonal and the lower triangle stay visible.
    hidden_cells = np.triu(np.ones(Correlation_Matrix.shape), k=1)
    fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    # The colour scale is pinned to [0, 1] with a 10-step green palette.
    heatmap_options = dict(
        ax=ax,
        mask=hidden_cells,
        annot=True,
        square=True,
        cmap=sns.color_palette("Greens", n_colors=10),
        linewidths=0.2,
        vmin=0,
        vmax=1,
        cbar_kws={'label': 'Correlation', "aspect":30, "shrink": .4},
        annot_kws={"size": annot_kws},
    )
    sns.heatmap(Correlation_Matrix, **heatmap_options)
    return Correlation_Matrix
# Correlation heatmap of the outlier-treated frame with Fig_Size = 9; the returned matrix is discarded.
_ = Correlation_Plot (df, 9)
| | Pregnancies | Glucose | Blood Pressure | Skin Thickness | Insulin | BMI | Diabetes Pedigree Function | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| Pregnancies | 1.00 | 0.12 | 0.22 | -0.10 | -0.07 | 0.02 | 0.02 | 0.55 | 0.20 |
| Glucose | 0.12 | 1.00 | 0.22 | 0.05 | 0.22 | 0.23 | 0.10 | 0.27 | 0.49 |
| Blood Pressure | 0.22 | 0.22 | 1.00 | 0.02 | -0.06 | 0.26 | 0.03 | 0.33 | 0.16 |
| Skin Thickness | -0.10 | 0.05 | 0.02 | 1.00 | 0.45 | 0.37 | 0.15 | -0.12 | 0.06 |
| Insulin | -0.07 | 0.22 | -0.06 | 0.45 | 1.00 | 0.14 | 0.18 | -0.07 | 0.08 |
| BMI | 0.02 | 0.23 | 0.26 | 0.37 | 0.14 | 1.00 | 0.12 | 0.06 | 0.31 |
| Diabetes Pedigree Function | 0.02 | 0.10 | 0.03 | 0.15 | 0.18 | 0.12 | 1.00 | 0.07 | 0.19 |
| Age | 0.55 | 0.27 | 0.33 | -0.12 | -0.07 | 0.06 | 0.07 | 1.00 | 0.26 |
| Outcome | 0.20 | 0.49 | 0.16 | 0.06 | 0.08 | 0.31 | 0.19 | 0.26 | 1.00 |
# Save the outlier-treated frame next to the source file as '<name>_mod.csv'.
# rsplit(".", 1) strips only the final extension, so a dot elsewhere in the
# path (e.g. in a directory name) no longer truncates the output location.
df.to_csv (Path.rsplit(".", 1)[0]+'_mod.csv', index = None, header=True)
# Split into predictors and label.
# NOTE(review): X is taken from the raw Data, not from the outlier-treated df - confirm this is intended.
X = Data.drop(columns = [Target])
y = Data[Target]
Moreover, high variance for some features can hurt our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance.
# scaling data
# X is already a DataFrame, so it is used directly (re-wrapping it was a no-op).
scaler = preprocessing.StandardScaler()
X_std = scaler.fit_transform(X)
# fit_transform returns a bare array; restore the column names and reuse X's
# index so X_std stays row-aligned with X even if the index is not 0..n-1.
X_std = pd.DataFrame(data = X_std, columns = X.columns, index = X.index)
del scaler
# Two stacked one-row heatmaps: per-feature variance before and after standardising.
fig, ax = plt.subplots(2, 1, figsize=(12, 8))
ax = ax.ravel()
font = FontProperties()
font.set_weight('bold')
CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", X.shape[1])]
Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
Sets = [X, X_std]
kws = dict(label='Feature\nVariance', aspect=10, shrink= .3)
for panel, frame, palette, title in zip(ax, Sets, CP, Names):
    # One-row frame of variances, largest first, so the heatmap is a single strip.
    Temp = frame.var().sort_values(ascending = False).to_frame(name= 'Variance').round(2).T
    # .iloc[0] reads the row maximum by position; the Series is labelled
    # 'Variance', and integer-key positional fallback is deprecated in pandas.
    _ = sns.heatmap(Temp, ax=panel, annot=True, square=True, cmap = palette,
                    linewidths = 0.8, vmin=0, vmax=Temp.max(axis =1).iloc[0], annot_kws={"size": 14},
                    cbar_kws=kws)
    _ = panel.set_yticklabels('')
    _ = panel.set_title(title, fontproperties=font, fontsize = 14)
del Temp, panel, frame, palette, title
# Overwrite df's predictor columns with the standardised values and save the
# result as '<name>_STD.csv' (rsplit strips only the final extension).
# NOTE(review): X_std derives from the raw Data, so this replaces the outlier-treated values in df - confirm.
df[X.columns] = X_std.copy()
df.to_csv (Path.rsplit(".", 1)[0]+'_STD.csv', index = None, header=True)
del CP, Names, ax, fig, font, Sets, kws, X_std